# Global knitr chunk options: echo code, suppress messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Package attach order matters in R: functions from packages attached later
# mask earlier ones, so keep tidyverse first and the modeling packages after.
library(tidyverse) #used for data wrangling and viz
library(here) #simplifies file paths
library(rsample) #used to split data
library(janitor) #used to clean data names
library(corrplot) #for correlation viz
library(VIM) #for missing data viz
library(missMDA) #for imputing missing data
library(ggfortify) #for PCA viz
library(fastDummies) #for dummy coding
library(caret) #for cross validation
library(class) #for knn
library(gbm) #for boosted trees modeling
library(randomForest) #for random forest modeling
library(maptools) #spatial data utilities (loaded for the mapping section)
library(RColorBrewer) #color palettes for maps
library(classInt) #class intervals for choropleths — NOTE(review): not obviously used below, confirm
library(ggplot2) #already attached via tidyverse; kept for explicitness
library(ggrepel) #non-overlapping text labels — NOTE(review): not obviously used below, confirm
library(mapproj) #required by coord_map() projections used in the maps
library(viridis) #colorblind-friendly palettes
#turns scientific notation off
options(scipen = 100)
#some of our workflow includes randomness. here we set a seed so our workflow can be reproduced without worrying about random variation
set.seed(123)

The purpose of this project is to use machine learning techniques to predict which wildfires will grow to a catastrophic size. Our most accurate model will then be used to assess how global climate change may influence a wildfire's predicted size.
Background: the wildfire situation in North America and the influence of climate change.
What will it be used for?
This dataset is available on Kaggle. It is a subset drawn from a larger wildfire dataset.
Important attributes include:
fire_size - the size of the fire in acres
stat_cause_descr - the cause of the fire
vegetation - code corresponding to vegetation type
temp_cont - temperature (Celsius) when the fire was contained
A complete codebook is available in the project file.
#read in dataset
#here() resolves the path relative to the project root; the csv is the
#Kaggle download stored in the "archive" folder (see codebook)
us_wildfire <- read_csv(here("archive", "FW_Veg_Rem_Combined.csv"))
#Assign each state to a US region (following the National Geographic
#Education regions). Any state not listed below — and any missing state —
#falls through to "NorthEast", matching the original default.
us_wildfire <- us_wildfire %>%
  mutate(region = case_when(
    state %in% c("AK", "WA", "OR", "CA", "NV", "ID", "MT", "WY", "UT", "CO") ~ "West",
    state %in% c("AZ", "NM", "TX", "OK") ~ "SouthWest",
    state %in% c("ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH") ~ "MidWest",
    state %in% c("AR", "LA", "MS", "AL", "GA", "FL", "SC", "NC", "TN", "KY", "VA", "WV") ~ "SouthEast",
    TRUE ~ "NorthEast"
  ))
us_wildfire$year_diff = us_wildfire$disc_pre_year - min(us_wildfire$disc_pre_year)

Overview of methods
What we did to clean the data:
#Drop the junk columns (keep only fire_name through year_diff), convert all
#column names to lowercase snake_case with janitor, and keep only fires
#that have an associated weather file, since we want to use weather as a
#predictor and the other rows carry no weather data.
us_wildfire_clean <- us_wildfire %>%
  dplyr::select(fire_name:year_diff) %>%
  clean_names() %>%
  filter(weather_file != "File Not Found")
#Label vegetation codes using the provided codebook. A named lookup vector
#maps each numeric code to its description; codes not present in the
#codebook become NA, exactly as before.
veg_codebook <- c(
  "4"  = "Temperate Evergreen Needleleaf Forest TmpENF",
  "9"  = "C3 Grassland/Steppe",
  "12" = "Open Shrubland",
  "14" = "Desert",
  "15" = "Polar Desert/Rock/Ice",
  "16" = "Secondary Tropical Evergreen Broadleaf Forest"
)
us_wildfire_clean <- us_wildfire_clean %>%
  mutate(vegetation_classed = unname(veg_codebook[as.character(vegetation)]))
#According to the metadata for this data set, the vegetation was created by
#interpolating most likely vegetation based on latitude and longitude. The
#most common vegetation type is listed as "Polar Desert/Rock/Ice" and this
#seems very unlikely.
#There are some weather observations for which every weather field is 0;
#this seems implausible, so we replace those zeros with NA.
#(mutate_at()/vars() are superseded in dplyr; across() is the current idiom)
us_wildfire_clean <- us_wildfire_clean %>%
  mutate(across(temp_pre_30:hum_cont, ~ na_if(.x, 0)))
#precipitation is frequently (and legitimately) zero, so we cannot blanket
#replace zeros with NA there. Instead we follow the missingness pattern of
#the other weather columns: when temp, humidity and wind are all missing
#for a window, we assume precipitation is missing too.
#helper: mask precipitation when the window's other weather fields are all NA
mask_prec <- function(prec, temp, hum, wind) {
  if_else(is.na(temp) & is.na(hum) & is.na(wind), NA_real_, prec)
}
us_wildfire_clean <- us_wildfire_clean %>%
  mutate(
    prec_pre_30 = mask_prec(prec_pre_30, temp_pre_30, hum_pre_30, wind_pre_30),
    prec_pre_15 = mask_prec(prec_pre_15, temp_pre_15, hum_pre_15, wind_pre_15),
    prec_pre_7  = mask_prec(prec_pre_7,  temp_pre_7,  hum_pre_7,  wind_pre_7),
    prec_cont   = mask_prec(prec_cont,   temp_cont,   hum_cont,   wind_cont)
  )
#there are multiple date columns. however, since full dates are mostly
#missing, we will only keep month and year as variables
us_wildfire_clean <- us_wildfire_clean %>%
  dplyr::select(-c(disc_clean_date, disc_date_pre, cont_clean_date,
                   disc_pre_month, disc_date_final, cont_date_final,
                   putout_time))
#we also can reclass months into seasons, to reduce factor levels
#(months are the 3-letter abbreviations found in discovery_month)
us_wildfire_clean <- us_wildfire_clean %>%
  mutate(season = case_when(discovery_month %in% c("Apr", "May", "Jun") ~ "Spring",
                            discovery_month %in% c("Jul", "Aug", "Sep") ~ "Summer",
                            discovery_month %in% c("Oct", "Nov", "Dec") ~ "Fall",
                            discovery_month %in% c("Jan", "Feb", "Mar") ~ "Winter")) %>%
  select(-discovery_month)

A few key graphs, exploring the data
#summarise total acres burned per discovery year
acres_per_year <- us_wildfire_clean %>%
  group_by(disc_pre_year) %>%
  summarise(acres_burned = sum(fire_size))
#fire size (finalized graph): one point per year, size and color both
#encoding total acreage so the legend can be dropped
ggplot(acres_per_year,
       aes(x = disc_pre_year,
           y = acres_burned,
           size = acres_burned,
           color = acres_burned)) +
  geom_point() +
  scale_color_continuous(high = "firebrick", low = "goldenrod1") +
  labs(x = "Year", y = "Total Acres Burned",
       title = "Total acres burned per year from 1990 to 2015") +
  theme_minimal() +
  theme(legend.position = "none")
#most common causes of fire
#count fires per listed starting cause
fire_causes <- us_wildfire_clean %>%
  group_by(stat_cause_descr) %>%
  count()
#cause (finalized)
#FIX: the original passed `tite =` to labs(); ggplot silently ignores
#unknown labs() arguments, so the plot rendered without a title.
ggplot(data = fire_causes, aes(y = reorder(stat_cause_descr, n), x = n)) +
  geom_col(aes(fill = n)) +
  scale_fill_gradient(high = "firebrick", low = "goldenrod1") +
  labs(x = "Number of Fires",
       y = "Cause",
       title = "Number of fires per listed starting cause") +
  theme_minimal() +
  theme(legend.position = "none")
#ordered factor of the fire size class (A = smallest ... G = largest)
us_wildfire_clean$class_fac = factor(us_wildfire_clean$fire_size_class, levels = c("A", "B", "C", "D", "E", "F", "G"))
#base map polygons (us is currently unused but kept for parity)
us <- map_data("world", 'usa')
state <- map_data("state")
#assign map regions matching the regions built for the fire data
#(map_data uses lowercase full state names, hence the different lists)
state$region2 = "NorthEast"
state[which(state$region %in% c("alaska", "washington", "oregon", "california", "nevada", "idaho", "montana", "utah", "wyoming", "colorado")), "region2"] = "West"
state[which(state$region %in% c("arizona", "new mexico", "oklahoma", "texas")), "region2"] = "SouthWest"
state[which(state$region %in% c("north dakota", "south dakota", "nebraska", "kansas", "minnesota", "iowa", "missouri", "wisconsin", "illinois", "indiana", "michigan", "ohio")), "region2"] = "MidWest"
state[which(state$region %in% c("arkansas", "louisiana", "mississippi", "alabama", "florida", "georgia", "south carolina", "north carolina", "tennessee", "kentucky", "virginia", "west virginia")), "region2"] = "SouthEast"
#state$region = as.factor(state$region)
#region overview map
#FIX: polygons must be grouped by the map_data `group` column (one id per
#polygon); grouping by state name joins separate polygons (e.g. islands)
#and distorts the map — the later maps already use group = group.
ggplot(data = state, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = region2)) +
  ggtitle("US Region") +
  guides(fill = guide_legend(title = "Region")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
#all fires, colored by size class
ggplot() +
  geom_polygon(data = state, aes(x = long, y = lat, group = group), color = "white", fill = "grey") +
  geom_point(data = us_wildfire_clean, aes(x = longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd") +
  ggtitle("US Wildfire Distribution") +
  guides(color = guide_legend(title = "Wild Fire Scale")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
#fires before 1970 (wstation_byear is the weather station begin year, used
#here as an era proxy — NOTE(review): confirm against the codebook)
ggplot() +
  geom_polygon(data = state, aes(x = long, y = lat, group = group), color = "white", fill = "grey") +
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear < 1970),], aes(x = longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd") +
  ggtitle("US Wildfire Distribution before 1970") +
  guides(color = guide_legend(title = "Wild Fire Scale")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
#fires 1970-2000
ggplot() +
  geom_polygon(data = state, aes(x = long, y = lat, group = group), color = "white", fill = "grey") +
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$wstation_byear < 2000),], aes(x = longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd") +
  ggtitle("US Wildfire Distribution 1970-2000") +
  guides(color = guide_legend(title = "Wild Fire Scale")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
#fires after 2000
#FIX: the original filtered on `>= 200`, a typo that kept nearly every row
ggplot() +
  geom_polygon(data = state, aes(x = long, y = lat, group = group), color = "white", fill = "grey") +
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 2000),], aes(x = longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd") +
  ggtitle("US Wildfire Distribution after 2000") +
  guides(color = guide_legend(title = "Wild Fire Scale")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
#fire-size density by era, restricted to fires over 100 acres
#FIX: the first era used `<= 1970` while the second used `>= 1970`, so 1970
#was counted in both eras; use `< 1970` to match the maps above.
ggplot() +
  geom_density(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear < 1970 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y = ..density..),
               alpha = .3,
               colour = "dodgerblue", fill = "dodgerblue") +
  geom_density(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$fire_size > 100 & us_wildfire_clean$wstation_byear < 2000),], aes(x = fire_size, y = ..density..),
               alpha = .3,
               colour = "yellow3", fill = "yellow3") +
  geom_density(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 2000 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y = ..density..),
               alpha = .3,
               colour = "firebrick3", fill = "firebrick3") +
  xlim(10000, 100000) +
  ggtitle("Wildfire Severity") #typo "Severeity" fixed
#missing data plot
#VIM::aggr(): left panel shows the proportion missing per variable, right
#panel the joint missingness pattern across variables
aggr_plot <- aggr(us_wildfire_clean,
col = c('navyblue','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(us_wildfire_clean),
cex.axis = .7,
gap = 2,
ylab = c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## fire_name 0.50282019
## hum_cont 0.42786638
## wind_cont 0.41558884
## temp_cont 0.41002139
## prec_cont 0.41002139
## vegetation_classed 0.17480307
## hum_pre_7 0.14329476
## hum_pre_15 0.12783234
## wind_pre_7 0.12250802
## temp_pre_7 0.11173782
## prec_pre_7 0.11173782
## wind_pre_15 0.10663231
## temp_pre_15 0.09571623
## prec_pre_15 0.09571623
## hum_pre_30 0.09413595
## wind_pre_30 0.07308179
## temp_pre_30 0.06216571
## prec_pre_30 0.06216571
## fire_size 0.00000000
## fire_size_class 0.00000000
## stat_cause_descr 0.00000000
## latitude 0.00000000
## longitude 0.00000000
## state 0.00000000
## disc_pre_year 0.00000000
## wstation_usaf 0.00000000
## dstation_m 0.00000000
## wstation_wban 0.00000000
## wstation_byear 0.00000000
## wstation_eyear 0.00000000
## vegetation 0.00000000
## fire_mag 0.00000000
## weather_file 0.00000000
## remoteness 0.00000000
## region 0.00000000
## year_diff 0.00000000
## season 0.00000000
## class_fac 0.00000000
#The weather columns are likely correlated; build a correlation matrix to check.
#keep the weather columns in their own data frame
weather <- us_wildfire_clean %>%
  dplyr::select(temp_pre_30:prec_cont)
#correlate on complete cases only (NAs omitted pairwise-complete would differ)
cor_matrix <- cor(weather, use = "complete.obs")
#visualize the upper triangle, clustered so related variables sit together
corrplot(cor_matrix, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)
#there is a strong correlation within each set of variables, e.g. the
#temperature 7 days before the fire correlates with the temperature 30 days
#before the fire
#consider PCA on wind, PCA on hum, PCA on temp

We split our data into 80% training and 20% testing data. Because fire size is heavily skewed towards smaller fires, we used stratified sampling.
There are xyz observations in the training set and xyz observations in the test set.
#first we make a dataframe containing only variables we want to use in our models
fire_modeling_df <- us_wildfire_clean %>%
dplyr::select(-fire_name, #remove fire name
-fire_size_class, #remove fire size class
-class_fac, #which is also a size class
-state, #because we have regions
-disc_pre_year, #because we have year_diff to represent year
-wstation_usaf, #remove weather station name
-wstation_wban, #remove this (not described in codebook)
-wstation_byear, #remove station year installed
-wstation_eyear, #remove station year ended data recording
-weather_file, #remove name of weather file
-dstation_m, #remove distance of weather station to fire
-vegetation #remove vegetaiton because we already have it classed
)
#define split parameters: 80/20, stratified on fire_size because fire size
#is heavily skewed towards small fires
us_wildfire_split <- fire_modeling_df %>%
initial_split(prop = 0.8, strata = "fire_size")
#write split data to data frames
fire_train <- training(us_wildfire_split)
fire_test <- testing(us_wildfire_split)
#set up folds in training data for cross validation
#NOTE(review): train_folds does not appear to be used by the models below —
#confirm whether CV was intended for the tree models
train_folds <- vfold_cv(fire_train, v = 10, repeats = 5)
#select weather cols
#split each set into weather columns and everything else so the weather can
#be imputed and PCA'd on its own
weather_train <- fire_train %>%
select(temp_pre_30:prec_cont)
weather_test <- fire_test %>%
select(temp_pre_30:prec_cont)
#select cols not in weather
notweather_train <- fire_train %>%
select(-colnames(weather_train))
notweather_test <- fire_test %>%
select(-colnames(weather_test))
#imputation of weather data with missMDA::imputePCA, keeping 4 components;
#train and test are imputed separately to keep the split strictly separated
weather_train_imputed <- imputePCA(weather_train, ncp=4)
weather_test_imputed <- imputePCA(weather_test, ncp=4)
#Do PCA on both test and train data separately
#PCA on the imputed weather, train and test separately (keeping the split
#strictly separated, as with imputation)
weather_train_PCA <- weather_train_imputed$completeObs %>%
  scale(center = TRUE, scale = TRUE) %>% #first scale and center the data
  prcomp(rank. = 4) #keep the first 4 principal components
#weather_train_PCA$x
weather_test_PCA <- weather_test_imputed$completeObs %>%
  scale(center = TRUE, scale = TRUE) %>%
  prcomp(rank. = 4)
#Make a PCA bi-plot of the training weather
autoplot(weather_train_PCA,
         data = weather_train,
         loadings = TRUE,
         loadings.label = TRUE,
         loadings.label.hjust = 1.2) +
  theme_classic() +
  labs(caption = "Principal Component Analysis Bi-plot of Weather Training Data") #typo "Principle" fixed
#put data back together: bind the 4 PCA score columns to the non-weather
#columns, then drop any remaining incomplete rows
fire_train_complete <- cbind(notweather_train, weather_train_PCA$x) %>%
  na.omit()
fire_test_complete <- cbind(notweather_test, weather_test_PCA$x) %>%
  na.omit()

#baseline linear model: fire size from the four weather PCs plus year
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete))
##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11535 -2132 -972 198 527469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 900.96 183.42 4.912 0.0000009073 ***
## PC1 -946.90 34.34 -27.570 < 0.0000000000000002 ***
## PC2 -83.98 37.23 -2.256 0.0241 *
## PC3 91.64 43.42 2.111 0.0348 *
## PC4 -717.13 52.60 -13.635 < 0.0000000000000002 ***
## year_diff 66.81 12.10 5.522 0.0000000338 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12580 on 27223 degrees of freedom
## Multiple R-squared: 0.03419, Adjusted R-squared: 0.03402
## F-statistic: 192.8 on 5 and 27223 DF, p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "SouthWest"),]))##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region ==
## "SouthWest"), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9453 -2429 -859 516 529317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -497.17 526.14 -0.945 0.34473
## PC1 -615.87 83.10 -7.411 0.000000000000142 ***
## PC2 -598.26 85.69 -6.981 0.000000000003224 ***
## PC3 -38.79 210.87 -0.184 0.85404
## PC4 -684.99 163.45 -4.191 0.000028186488753 ***
## year_diff 100.09 30.76 3.254 0.00115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13990 on 6320 degrees of freedom
## Multiple R-squared: 0.02996, Adjusted R-squared: 0.02919
## F-statistic: 39.04 on 5 and 6320 DF, p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "SouthEast"),]))##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region ==
## "SouthEast"), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2524 -358 -215 -50 308819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.311 90.634 0.500 0.61713
## PC1 -83.689 31.245 -2.679 0.00740 **
## PC2 -32.851 22.389 -1.467 0.14232
## PC3 -49.264 18.405 -2.677 0.00745 **
## PC4 119.955 31.773 3.775 0.00016 ***
## year_diff 18.995 6.097 3.116 0.00184 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4163 on 12414 degrees of freedom
## Multiple R-squared: 0.003414, Adjusted R-squared: 0.003013
## F-statistic: 8.505 on 5 and 12414 DF, p-value: 0.00000004755
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "MidWest"),]))##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region ==
## "MidWest"), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3260 -1047 -549 -22 81411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1050.411 237.361 4.425 0.0000101 ***
## PC1 -480.255 54.636 -8.790 < 0.0000000000000002 ***
## PC2 -18.515 45.774 -0.404 0.685888
## PC3 41.233 51.842 0.795 0.426487
## PC4 -254.899 76.985 -3.311 0.000943 ***
## year_diff 5.654 13.775 0.410 0.681510
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4177 on 2436 degrees of freedom
## Multiple R-squared: 0.033, Adjusted R-squared: 0.03102
## F-statistic: 16.63 on 5 and 2436 DF, p-value: 0.0000000000000003531
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "West"),]))##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region ==
## "West"), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -17945 -7819 -4703 -1010 495921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 963.14 906.89 1.062 0.28828
## PC1 -1335.93 205.11 -6.513 0.0000000000819 ***
## PC2 693.22 234.57 2.955 0.00314 **
## PC3 423.89 596.90 0.710 0.47765
## PC4 -86.46 396.12 -0.218 0.82724
## year_diff 277.22 56.64 4.895 0.0000010191622 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24940 on 4346 degrees of freedom
## Multiple R-squared: 0.02072, Adjusted R-squared: 0.01959
## F-statistic: 18.39 on 5 and 4346 DF, p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "NorthEast"),]))##
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region ==
## "NorthEast"), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.0 -69.2 -34.2 1.7 9065.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.15430 34.58647 2.115 0.034567 *
## PC1 -35.91001 9.55715 -3.757 0.000178 ***
## PC2 0.87420 6.41421 0.136 0.891608
## PC3 -9.31642 4.91710 -1.895 0.058303 .
## PC4 -2.26750 6.23670 -0.364 0.716223
## year_diff -0.06306 2.07576 -0.030 0.975767
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 441.2 on 1683 degrees of freedom
## Multiple R-squared: 0.0118, Adjusted R-squared: 0.008864
## F-statistic: 4.019 on 5 and 1683 DF, p-value: 0.001252
KNN (short for K Nearest Neighbors) is a non-linear algorithm that works for both classification and regression problems. The basic premise behind the model is that it predicts the dependent variable for an observation based on how similar that observation is to other observations where the dependent variable is known.
KNN works by calculating the Euclidean distance between observations, so all inputs have to be numeric. Therefore, we have to do some pre-model data changes to our training and test sets. For both the test data and the training data we dummy code categorical variables, then we center and scale the data. As before, we assume that the training and test data are completely separated, so we process the two data sets separately.
#first we dummy code categorical variables
#NOTE(review): ordinal variables are treated as unordered here — consider
#an ordered encoding if any categorical is truly ordinal
knn_ready_train <- dummy_cols(fire_train_complete, select_columns = c("stat_cause_descr", "region", "vegetation_classed", "season")) %>%
  #then we get rid of the original (non-dummy-coded) categorical columns
  select(-stat_cause_descr, -region, -vegetation_classed, -season) %>%
  #ensure a plain data frame for the matrix-style indexing below
  as.data.frame()
#then we center and scale everything except the outcome in column 1 (fire_size)
knn_ready_train[,-1] <- scale(knn_ready_train[,-1], center = TRUE, scale = TRUE)
#same processing for the test data; per our design the test set is scaled
#with its own mean/sd, keeping train and test fully separated
knn_ready_test <- dummy_cols(fire_test_complete, select_columns = c("stat_cause_descr", "region", "vegetation_classed", "season")) %>%
  select(-stat_cause_descr, -region, -vegetation_classed, -season) %>%
  as.data.frame()
knn_ready_test[,-1] <- scale(knn_ready_test[,-1], center = TRUE, scale = TRUE)

Next we split up the training and test data into data frames that have only independent variables and dependent variables (in this case fire_size). We do this for both the test and the training data.
#YTrain is the true values for fire size on the training set
YTrain = knn_ready_train$fire_size
#XTrain is the design matrix (all predictors) for the training data
XTrain = knn_ready_train %>%
  select(-fire_size)
#YTest is the true values for fire_size on the test set
YTest = knn_ready_test$fire_size
#XTest is the design matrix for the test data
XTest = knn_ready_test %>%
  select(-fire_size)

Then we use leave-one-out cross validation to determine the best number of neighbors to consider. A low number of neighbors results in a highly flexible model, while a higher number results in a less flexible model. For this process we built a function into which we enter a starting value of K, an ending value of K, and the sampling interval. The result is a data frame of MSE values for each value of K that we test. This process is computationally intensive, so we automatically save the results to a csv.
#make a function that saves KNN LOOCV results for different values of K
#note: relies on the globals XTrain / YTrain built above, and writes its
#results to model_results/knn/ because the cross validation takes a long time
knn_loocv <- function(startk, endk, interval)
{
  #candidate numbers of nearest neighbors
  allK = seq(from = startk, to = endk, by = interval)
  #preallocate the MSE vector, one slot per candidate k
  k_mse = rep(NA_real_, length(allK))
  #for each candidate k, use LOOCV to find a validation error
  for (j in seq_along(allK)){
    #predict each observation from its k nearest neighbors, leaving it out
    pred.Yval = knn.cv(train = XTrain, cl = YTrain, k = allK[j])
    #FIX: knn.cv returns a factor of class labels; as.numeric() on a factor
    #yields the level INDEX, not the label value, which made the original
    #MSE meaningless — convert via character first.
    #(Indexing by position j also fixes the original k_mse[i], which wrote
    #past the end of the vector whenever interval > 1.)
    k_mse[j] = mean((as.numeric(as.character(pred.Yval)) - YTrain)^2)
  }
  #bind k values with their MSEs (same columns as before: allK, k_mse)
  knn_loocv_results <- data.frame(allK = allK, k_mse = k_mse)
  #save MSE as CSV (because the cross validation process takes a long time)
  write_csv(knn_loocv_results, paste0("model_results/knn/knn_mse_k", startk,"_k", endk, "_by", interval, ".csv"))
}
#we tried several different sets of k
knn_loocv(startk = 1, endk = 20, interval = 1)
knn_loocv(startk = 1, endk = 500, interval = 20)
knn_loocv(startk = 481, endk = 490, interval = 1)

Next, we go through our results for all the values of K that we tested. We find that the MSE increases as K increases.
#read in all the stored k values
knn_mse_k1_k500_by20 <- read_csv(here("model_results", "knn", "knn_mse_k1_k500_by20.csv"))
knn_mse_k1_k20_by1 <- read_csv(here("model_results", "knn", "knn_mse_k1_k20_by1.csv"))
#plot MSE across the coarse sweep of k
plot(knn_mse_k1_k500_by20$allK, knn_mse_k1_k500_by20$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 500 at intervals of 20")

When we look at K 1-20 we get the lowest MSE at around 3 before it starts increasing.

#plot MSE across the fine sweep of small k
plot(knn_mse_k1_k20_by1$allK, knn_mse_k1_k20_by1$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 20 at intervals of 1")

Just to confirm, we will print out the best number of neighbors.

#best number of neighbors; max() breaks MSE ties toward the larger
#(smoother, less flexible) k
numneighbor = max(knn_mse_k1_k20_by1$allK[knn_mse_k1_k20_by1$k_mse == min(knn_mse_k1_k20_by1$k_mse)])
#print best number of neighbors
numneighbor
## [1] 3
Now that we’ve found the best number of nearest neighbors to consider, we try out KNN on our test data! Below is the test MSE and the test RMSE.
#predict the test set with KNN using the best neighbor value
pred.YTest = knn(train = XTrain, test = XTest, cl = YTrain, k = numneighbor)
#FIX: knn() returns a factor; as.numeric() on a factor gives level indices,
#not the label values, so convert via character before computing the MSE
test_MSE <- mean((as.numeric(as.character(pred.YTest)) - YTest)^2)
#print test MSE
test_MSE
## [1] 83563638
#print test RMSE
sqrt(test_MSE)
## [1] 9141.315
Overall, this model performs okay.
#train boosted tree model
#NOTE(review): this fits on fire_train (raw split, with NAs), not the
#fire_train_complete used by the random forest below — confirm intent
fire_size_boost = gbm(fire_size~.,
                      data = fire_train,
                      n.trees = 500,
                      interaction.depth = 4
)
#the model summary creates a pretty visualization of relative influence
summary(fire_size_boost)
#calculate training error
#FIX: predict.gbm takes `newdata` (plus n.trees); the original's `data =`
#was swallowed by `...`, so the intended data was never used and the
#train/test predictions were not what they appeared to be.
predictions_train_boost <- predict(fire_size_boost, newdata = fire_train, n.trees = 500)
#calculate rmse for training data
RMSE(predictions_train_boost, fire_train$fire_size)
#calculate test error
predictions_test_boost <- predict(fire_size_boost, newdata = fire_test, n.trees = 500)
#calculate rmse for test data
RMSE(predictions_test_boost, fire_test$fire_size)
#train model
#train random forest model on the completed (imputed + PCA) training data
#note: the original passed mode = "regression", but randomForest() has no
#`mode` argument (it was silently ignored); regression is inferred from the
#numeric outcome, so the argument is dropped here
fire_size_rf = randomForest(fire_size ~ ., #writing the formula
                            data = fire_train_complete, #training data
                            mtry = 9, #number of variables randomly sampled per split
                            ntree = 500, #number of trees
                            na.action = na.omit, #drop rows with NAs
                            importance = TRUE #assess variable importance
)
#plot error versus number of trees
plot(fire_size_rf)
#plot variable importance (top 5)
varImpPlot(fire_size_rf,
           sort = TRUE,
           main = "Variable Importance for fire size random forest model",
           n.var = 5)
#calculate training error
#FIX: predict.randomForest takes `newdata`; with `data =` the calls fell
#back to OOB predictions on the training data, so the "test" RMSE was
#computed against a length-mismatched vector. Also predict and score
#against fire_train_complete / fire_test_complete — the data the model was
#actually fit on — rather than the raw fire_train / fire_test.
predictions_train_rf <- predict(fire_size_rf, newdata = fire_train_complete)
#calculate rmse for training data
RMSE(predictions_train_rf, fire_train_complete$fire_size)
#calculate test error
predictions_test_rf <- predict(fire_size_rf, newdata = fire_test_complete)
#calculate rmse for test data
RMSE(predictions_test_rf, fire_test_complete$fire_size)

What type of modeling will we do? We must try 4 types.
predictions without climate change
How do predictions change with 2.5C increase in temp?